Prg 1 - Develop a program to create histograms for all numerical features and analyze the distribution of each feature. Generate box plots for all numerical features and identify any outliers. Use the California Housing dataset.
In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [45]:
df = pd.read_csv(r"./housing.csv")
In [46]:
df.head()
Out[46]:
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
In [47]:
df.shape
Out[47]:
(20640, 10)
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [49]:
df.isnull().sum()
Out[49]:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
In [50]:
df.duplicated().sum()
Out[50]:
0
In [51]:
df['total_bedrooms'].median()
Out[51]:
435.0
In [52]:
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
In [53]:
# Cast the count-like columns (housing_median_age .. households) to int
for col in df.iloc[:, 2:7]:
    df[col] = df[col].astype('int')
In [54]:
df.head()
Out[54]:
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41 | 880 | 129 | 322 | 126 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21 | 7099 | 1106 | 2401 | 1138 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52 | 1467 | 190 | 496 | 177 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52 | 1274 | 235 | 558 | 219 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52 | 1627 | 280 | 565 | 259 | 3.8462 | 342200.0 | NEAR BAY |
In [55]:
df.describe().T
Out[55]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| longitude | 20640.0 | -119.569704 | 2.003532 | -124.3500 | -121.8000 | -118.4900 | -118.01000 | -114.3100 |
| latitude | 20640.0 | 35.631861 | 2.135952 | 32.5400 | 33.9300 | 34.2600 | 37.71000 | 41.9500 |
| housing_median_age | 20640.0 | 28.639486 | 12.585558 | 1.0000 | 18.0000 | 29.0000 | 37.00000 | 52.0000 |
| total_rooms | 20640.0 | 2635.763081 | 2181.615252 | 2.0000 | 1447.7500 | 2127.0000 | 3148.00000 | 39320.0000 |
| total_bedrooms | 20640.0 | 536.838857 | 419.391878 | 1.0000 | 297.0000 | 435.0000 | 643.25000 | 6445.0000 |
| population | 20640.0 | 1425.476744 | 1132.462122 | 3.0000 | 787.0000 | 1166.0000 | 1725.00000 | 35682.0000 |
| households | 20640.0 | 499.539680 | 382.329753 | 1.0000 | 280.0000 | 409.0000 | 605.00000 | 6082.0000 |
| median_income | 20640.0 | 3.870671 | 1.899822 | 0.4999 | 2.5634 | 3.5348 | 4.74325 | 15.0001 |
| median_house_value | 20640.0 | 206855.816909 | 115395.615874 | 14999.0000 | 119600.0000 | 179700.0000 | 264725.00000 | 500001.0000 |
In [56]:
Numericals = df.select_dtypes(include=[np.number]).columns
print(Numericals)
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income',
'median_house_value'],
dtype='object')
In [21]:
for col in Numericals:
plt.figure(figsize=(10,6))
df[col].plot(kind="hist", title=col, bins=60, edgecolor="black")
plt.ylabel("Frequency")
plt.show()
In [57]:
for col in Numericals:
# print(df[col])
plt.figure(figsize=(6,6))
sns.boxplot(data=df[col], color="blue")
plt.title(col)
plt.ylabel(col)
plt.show()
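The box plots flag outliers visually; a quick numerical check with the usual 1.5×IQR rule (the threshold is a convention, not part of the assignment) makes them explicit. A minimal sketch:

In [ ]:
# Count outliers per numerical feature using the 1.5*IQR rule
for col in Numericals:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_outliers = ((df[col] < lower) | (df[col] > upper)).sum()
    print(f"{col}: {n_outliers} outliers outside [{lower:.2f}, {upper:.2f}]")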
Prg 2 - Develop a program to compute the correlation matrix to understand the relationships between pairs of features. Visualize the correlation matrix using a heatmap to see which variables have strong positive/negative correlations. Create a pair plot to visualize pairwise relationships between features. Use the California Housing dataset.
In [325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
import warnings
warnings.filterwarnings("ignore")
In [326]:
data = fetch_california_housing()
In [327]:
df = pd.DataFrame(data.data, columns=data.feature_names)
In [328]:
df['Target'] = data.target
In [329]:
df.head()
Out[329]:
| | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Target |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
| 1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
| 2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
| 3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
| 4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
In [330]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   Target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
In [331]:
df.describe()
Out[331]:
| | MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | Target |
|---|---|---|---|---|---|---|---|---|---|
| count | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 | 20640.000000 |
| mean | 3.870671 | 28.639486 | 5.429000 | 1.096675 | 1425.476744 | 3.070655 | 35.631861 | -119.569704 | 2.068558 |
| std | 1.899822 | 12.585558 | 2.474173 | 0.473911 | 1132.462122 | 10.386050 | 2.135952 | 2.003532 | 1.153956 |
| min | 0.499900 | 1.000000 | 0.846154 | 0.333333 | 3.000000 | 0.692308 | 32.540000 | -124.350000 | 0.149990 |
| 25% | 2.563400 | 18.000000 | 4.440716 | 1.006079 | 787.000000 | 2.429741 | 33.930000 | -121.800000 | 1.196000 |
| 50% | 3.534800 | 29.000000 | 5.229129 | 1.048780 | 1166.000000 | 2.818116 | 34.260000 | -118.490000 | 1.797000 |
| 75% | 4.743250 | 37.000000 | 6.052381 | 1.099526 | 1725.000000 | 3.282261 | 37.710000 | -118.010000 | 2.647250 |
| max | 15.000100 | 52.000000 | 141.909091 | 34.066667 | 35682.000000 | 1243.333333 | 41.950000 | -114.310000 | 5.000010 |
In [332]:
df.isnull().sum()
Out[332]:
MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64
In [333]:
df.duplicated().sum()
Out[333]:
0
In [334]:
df.hist(figsize=(12,8), bins=30, edgecolor="black")  # df.hist creates its own figure
plt.suptitle("Feature Distribution", fontsize=16)
plt.show()
In [336]:
plt.figure(figsize=(12,6))
sns.boxplot(data=df)
plt.xticks(rotation=45)
plt.suptitle("Boxplots of Features to Identify Outliers", fontsize=16)
plt.show()
In [339]:
plt.figure(figsize=(10,6))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.suptitle("Feature Correlation Heatmap", fontsize=16)
plt.show()
In [81]:
sns.pairplot(data=df[["MedInc", "HouseAge", "AveRooms", "Target"]], diag_kind="kde")
plt.show()
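To read the heatmap numerically, the correlations with the target can be sorted directly; a minimal sketch using the corr_matrix computed above:

In [ ]:
# Rank features by their correlation with the target
target_corr = corr_matrix["Target"].drop("Target").sort_values(ascending=False)
print(target_corr)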
Prg 3 - Develop a program to implement Principal Component Analysis (PCA) for reducing the dimensionality of the Iris dataset from 4 features to 2.
In [340]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [341]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
In [342]:
iris = datasets.load_iris()
In [343]:
X = iris.data
y = iris.target
In [344]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [345]:
cov_matrix = np.cov(X_scaled.T)
In [346]:
cov_matrix
Out[346]:
array([[ 1.00671141, -0.11835884, 0.87760447, 0.82343066],
[-0.11835884, 1.00671141, -0.43131554, -0.36858315],
[ 0.87760447, -0.43131554, 1.00671141, 0.96932762],
[ 0.82343066, -0.36858315, 0.96932762, 1.00671141]])
In [347]:
evalues, evectors = np.linalg.eig(cov_matrix)
In [348]:
evalues
Out[348]:
array([2.93808505, 0.9201649 , 0.14774182, 0.02085386])
In [355]:
evectors
Out[355]:
array([[ 0.52106591, -0.37741762, -0.71956635, 0.26128628],
[-0.26934744, -0.92329566, 0.24438178, -0.12350962],
[ 0.5804131 , -0.02449161, 0.14212637, -0.80144925],
[ 0.56485654, -0.06694199, 0.63427274, 0.52359713]])
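The eigenvalues already determine how much variance each principal component carries; normalizing them previews the explained-variance ratios that PCA reports below. A small sketch:

In [ ]:
# Proportion of total variance carried by each eigenvalue
explained = evalues / evalues.sum()
for i, ratio in enumerate(explained, start=1):
    print(f"PC{i}: {ratio:.2%} of total variance")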
In [350]:
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
colors = ["red", "green", "blue"]
labels = iris.target_names
for i in range(len(colors)):
ax.scatter(X_scaled[y == i, 0], X_scaled[y == i, 1], X_scaled[y == i, 2], color=colors[i], label=labels[i])
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
ax.set_zlabel("Petal Length")
ax.set_title("3D Visualization of Iris Data Before PCA")
plt.legend()
plt.show()
In [130]:
U, S, Vt = np.linalg.svd(X_scaled, full_matrices=False)
In [131]:
S
Out[131]:
array([20.92306556, 11.7091661 , 4.69185798, 1.76273239])
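The singular values and the covariance eigenvalues describe the same decomposition: S**2 / (n - 1) should reproduce the eigenvalues, since np.cov uses the unbiased (n - 1) estimator. A quick consistency check (a sketch):

In [ ]:
# Singular values relate to covariance eigenvalues via S**2 / (n - 1)
n_samples = X_scaled.shape[0]
print(S**2 / (n_samples - 1))   # from the SVD
print(np.sort(evalues)[::-1])   # from the eigendecomposition, sorted descending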
In [132]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
In [133]:
explained_var = pca.explained_variance_ratio_
In [134]:
print(f"Explained Variance by PC1: {explained_var[0]:.2f}")
print(f"Explained Variance by PC2: {explained_var[1]:.2f}")
Explained Variance by PC1: 0.73
Explained Variance by PC2: 0.23
In [135]:
plt.figure(figsize=(8, 6))
for i in range(len(colors)):
plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], color=colors[i], label=labels[i])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA on Iris Dataset (Dimensionality Reduction)')
plt.legend()
plt.grid()
plt.show()
In [136]:
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection="3d")
for i in range(len(colors)):
ax.scatter(X_scaled[y == i, 0], X_scaled[y == i, 1], X_scaled[y == i, 2], color=colors[i], label=labels[i])
for i in range(3): # Plot first three eigenvectors
ax.quiver(0, 0, 0, evectors[i, 0], evectors[i, 1], evectors[i, 2], color='black', length=1)
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Sepal Width')
ax.set_zlabel('Petal Length')
ax.set_title('3D Data with Eigenvectors')
plt.legend()
plt.show()
Prg 4 - For a given set of training data examples stored in a .CSV file, implement and demonstrate the Find-S algorithm to output a description of the set of all hypotheses consistent with the training examples.
In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [138]:
data = pd.read_csv(r"./training_data2.csv")
In [139]:
data
Out[139]:
| | Experience | Qualification | Skill | Age | Hired |
|---|---|---|---|---|---|
| 0 | Yes | Masters | Python | 30 | Yes |
| 1 | Yes | Bachelors | Python | 25 | Yes |
| 2 | No | Bachelors | Java | 28 | No |
| 3 | Yes | Masters | Java | 40 | Yes |
| 4 | No | Masters | Python | 35 | No |
In [140]:
def find_s_alg(data):
    attr = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values
    # Initialize the hypothesis with the first positive example
    for i in range(len(target)):
        if target[i] == "Yes":
            hypo = attr[i].copy()
            break
    # Generalize: replace any attribute that disagrees with a positive example by '?'
    for i in range(len(target)):
        if target[i] == "Yes":
            for j in range(len(hypo)):
                if hypo[j] != attr[i][j]:
                    hypo[j] = "?"
    return hypo
In [142]:
final_hypo = find_s_alg(data)
final_hypo
Out[142]:
array(['Yes', '?', '?', '?'], dtype=object)
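To see how Find-S generalizes example by example, the same loop can print its intermediate hypothesis; a sketch (the trace helper below is an addition, not part of the original program):

In [ ]:
# Trace the hypothesis after each positive training example
def find_s_trace(data):
    attr = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values
    hypo = None
    for i in range(len(target)):
        if target[i] == "Yes":
            if hypo is None:
                hypo = attr[i].copy()        # initialize with first positive example
            else:
                for j in range(len(hypo)):   # generalize any mismatching attribute
                    if hypo[j] != attr[i][j]:
                        hypo[j] = "?"
            print(f"Hypothesis after example {i}: {list(hypo)}")
    return hypo

find_s_trace(data)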
Prg 5 - Develop a program to implement the k-Nearest Neighbour algorithm to classify 100 randomly generated values of x in the range [0,1]. Perform the following based on the dataset generated.
In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [144]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
In [146]:
np.random.seed(42)
values = np.random.rand(100)
In [147]:
labels = []
for i in values[:50]:
if i <=0.5:
labels.append('Class1')
else:
labels.append('Class2')
In [148]:
labels += [None] * 50
In [150]:
print(labels)
['Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1', 'Class1', 'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2', 'Class1', 'Class2', 'Class2', 'Class1', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
In [151]:
data = {
"Point": [f"x{i+1}" for i in range(100)],
"Value": values,
"Label": labels
}
In [152]:
df = pd.DataFrame(data)
In [153]:
df.head()
Out[153]:
| | Point | Value | Label |
|---|---|---|---|
| 0 | x1 | 0.374540 | Class1 |
| 1 | x2 | 0.950714 | Class2 |
| 2 | x3 | 0.731994 | Class2 |
| 3 | x4 | 0.598658 | Class2 |
| 4 | x5 | 0.156019 | Class1 |
In [154]:
df.nunique()
Out[154]:
Point    100
Value    100
Label      2
dtype: int64
In [155]:
df.shape
Out[155]:
(100, 3)
In [156]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Point   100 non-null    object 
 1   Value   100 non-null    float64
 2   Label   50 non-null     object 
dtypes: float64(1), object(2)
memory usage: 2.5+ KB
In [158]:
df.describe().T
Out[158]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Value | 100.0 | 0.470181 | 0.297489 | 0.005522 | 0.193201 | 0.464142 | 0.730203 | 0.986887 |
In [159]:
df.isnull().sum()
Out[159]:
Point     0
Value     0
Label    50
dtype: int64
In [162]:
num_col = df.select_dtypes(include=['int', 'float']).columns
df[num_col].hist(figsize=(12, 8), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()
In [176]:
labelled_df = df[df["Label"].notna()]
X_train = labelled_df[["Value"]]
y_train = labelled_df["Label"]
In [185]:
unlabelled_df = df[df["Label"].isna()]
X_test = unlabelled_df[["Value"]]
In [186]:
true_labels = ["Class1" if x <= 0.5 else "Class2" for x in values[50:]]
In [187]:
k_values = [1, 2, 3, 4, 5, 20, 30]
results = {}
accuracies = {}
In [188]:
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
results[k] = pred
acc = accuracy_score(true_labels, pred) * 100
accuracies[k] = acc
print(f"Accuracy for k={k}: {acc:.2f}%")
# Assign predictions back to the DataFrame for this k
unlabelled_df[f"Label_k{k}"] = pred
Accuracy for k=1: 100.00%
Accuracy for k=2: 100.00%
Accuracy for k=3: 98.00%
Accuracy for k=4: 98.00%
Accuracy for k=5: 98.00%
Accuracy for k=20: 98.00%
Accuracy for k=30: 100.00%
In [189]:
print(results)
{1: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 2: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 3: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 4: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 5: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 20: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object), 30: array(['Class2', 'Class2', 'Class2', 'Class2', 'Class2', 'Class2',
'Class1', 'Class1', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class1', 'Class1', 'Class2', 'Class1', 'Class2',
'Class1', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class1', 'Class1', 'Class1', 'Class1',
'Class2', 'Class2', 'Class2', 'Class1', 'Class1', 'Class2',
'Class2', 'Class2', 'Class2', 'Class1', 'Class2', 'Class1',
'Class1', 'Class1'], dtype=object)}
In [194]:
df1 = unlabelled_df.drop(columns=["Label"])
df1
Out[194]:
| | Point | Value | Label_k1 | Label_k2 | Label_k3 | Label_k4 | Label_k5 | Label_k20 | Label_k30 |
|---|---|---|---|---|---|---|---|---|---|
| 50 | x51 | 0.969585 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 51 | x52 | 0.775133 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 52 | x53 | 0.939499 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 53 | x54 | 0.894827 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 54 | x55 | 0.597900 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 55 | x56 | 0.921874 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 56 | x57 | 0.088493 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 57 | x58 | 0.195983 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 58 | x59 | 0.045227 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 59 | x60 | 0.325330 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 60 | x61 | 0.388677 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 61 | x62 | 0.271349 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 62 | x63 | 0.828738 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 63 | x64 | 0.356753 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 64 | x65 | 0.280935 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 65 | x66 | 0.542696 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 66 | x67 | 0.140924 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 67 | x68 | 0.802197 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 68 | x69 | 0.074551 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 69 | x70 | 0.986887 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 70 | x71 | 0.772245 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 71 | x72 | 0.198716 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 72 | x73 | 0.005522 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 73 | x74 | 0.815461 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 74 | x75 | 0.706857 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 75 | x76 | 0.729007 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 76 | x77 | 0.771270 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 77 | x78 | 0.074045 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 78 | x79 | 0.358466 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 79 | x80 | 0.115869 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 80 | x81 | 0.863103 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 81 | x82 | 0.623298 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 82 | x83 | 0.330898 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 83 | x84 | 0.063558 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 84 | x85 | 0.310982 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 85 | x86 | 0.325183 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 86 | x87 | 0.729606 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 87 | x88 | 0.637557 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 88 | x89 | 0.887213 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 89 | x90 | 0.472215 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 90 | x91 | 0.119594 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 91 | x92 | 0.713245 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 92 | x93 | 0.760785 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 93 | x94 | 0.561277 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 94 | x95 | 0.770967 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 95 | x96 | 0.493796 | Class1 | Class1 | Class2 | Class2 | Class2 | Class2 | Class1 |
| 96 | x97 | 0.522733 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 | Class2 |
| 97 | x98 | 0.427541 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 98 | x99 | 0.025419 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
| 99 | x100 | 0.107891 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 | Class1 |
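The accuracies collected above can also be plotted to see how the choice of k affects this (almost linearly separable) one-dimensional problem; a small sketch using the accuracies dict:

In [ ]:
# Plot accuracy against k
plt.figure(figsize=(8, 5))
plt.plot(list(accuracies.keys()), list(accuracies.values()), marker="o")
plt.xlabel("k (number of neighbours)")
plt.ylabel("Accuracy (%)")
plt.title("kNN Accuracy vs k")
plt.grid(True)
plt.show()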
Prg 6 - Implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select an appropriate dataset for your experiment and draw graphs.
In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [224]:
def g_kernel(x, x_query, tau):
    # Gaussian kernel: weight ~1 near the query point, decaying with distance
    return np.exp(-(x - x_query) ** 2 / (2 * tau ** 2))
In [225]:
def lwr(X, y, x_query, tau):
    X_b = np.c_[np.ones(len(X)), X]         # Add bias (intercept) column
    x_query_b = np.array([1, x_query])      # Query point with bias term
    W = np.diag(g_kernel(X, x_query, tau))  # Diagonal weight matrix
    # Weighted normal equations: theta = (X^T W X)^(-1) X^T W y
    theta = np.linalg.inv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta
In [226]:
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 1.3, 3.75, 2.25])
In [227]:
x_query = 3
tau = 1
In [228]:
y_pred = lwr(X, y, x_query, tau)
In [229]:
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.scatter(x_query, y_pred, color='red', label=f'Prediction at x={x_query}')
weights = g_kernel(X, x_query, tau)
for i in range(len(X)):
plt.plot([X[i], X[i]], [y[i], y[i] - weights[i]], 'k-', lw=1)
plt.scatter(X[i], y[i], s=weights[i] * 200, color='green', alpha=0.5)
plt.title("Locally Weighted Regression (LWR)")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
In [212]:
from sklearn.linear_model import LinearRegression
In [233]:
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau = 1.0 # Bandwidth parameter
# Compute LWR predictions
y_lwr = np.array([lwr(X, y, x_q, tau) for x_q in X_query])
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
plt.plot(X_query, y_lwr, color='red', label='Locally Weighted Regression')
plt.title("Comparison: Simple Linear Regression vs. Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
In [238]:
# Complex Dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau_values = [0.1, 0.5, 1.0, 5.0, 10.0] # Different bandwidth values
def lwr(X, y, x_query, tau):
    X_b = np.c_[np.ones(len(X)), X]         # Add bias term (intercept)
    x_query_b = np.array([1, x_query])      # Query point with bias term
    W = np.diag(g_kernel(X, x_query, tau))  # Weights from the Gaussian kernel defined earlier
    # Compute theta using the pseudo-inverse to avoid singular-matrix errors
    theta = np.linalg.pinv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta                # Prediction at the query point
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(12, 8))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
# Plot LWR for different tau values
colors = ['red', 'green', 'purple', 'orange', 'brown']
for tau, color in zip(tau_values, colors):
y_lwr = np.array([lwr(X, y, x_q, tau) for x_q in X_query])
plt.plot(X_query, y_lwr, color=color, label=f'LWR (τ={tau})')
plt.title("Effect of Different τ Values in Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()
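The dataset above is close to linear, so LWR and ordinary least squares barely differ. The bandwidth effect is clearer on a nonlinear target; a sketch on synthetic sine data (the data here is an assumption, chosen only for illustration):

In [ ]:
# LWR on a nonlinear dataset: small tau tracks the curve, large tau flattens it
X_sin = np.linspace(0, 10, 50)
y_sin = np.sin(X_sin) + np.random.normal(0, 0.15, size=X_sin.shape)
X_q = np.linspace(0, 10, 200)

plt.figure(figsize=(10, 6))
plt.scatter(X_sin, y_sin, color="blue", s=15, label="Noisy sine data")
for tau, color in zip([0.3, 1.0, 5.0], ["red", "green", "orange"]):
    y_fit = np.array([lwr(X_sin, y_sin, x_q, tau) for x_q in X_q])
    plt.plot(X_q, y_fit, color=color, label=f"LWR (tau={tau})")
plt.title("LWR on Nonlinear Data for Different Bandwidths")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()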
Prg 8 - Develop a program to demonstrate the working of the decision tree algorithm. Use the Breast Cancer dataset for building the decision tree and apply this knowledge to classify a new sample.
In [239]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [240]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
In [241]:
data = pd.read_csv(r'./WisconsinBreastCancerdataset.csv')
In [242]:
pd.set_option('display.max_columns', None)
In [243]:
data.head()
Out[243]:
| | id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
In [244]:
data.shape
Out[244]:
(569, 33)
In [245]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [246]:
data.diagnosis.unique()
Out[246]:
array(['M', 'B'], dtype=object)
In [247]:
data.duplicated().sum()
Out[247]:
0
In [248]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)
In [251]:
df['diagnosis'] = df['diagnosis'].map({"M": 1, "B": 0}) # Malignant:1, Benign:0
In [253]:
df.describe().T
Out[253]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| diagnosis | 569.0 | 0.372583 | 0.483918 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.00000 |
| radius_mean | 569.0 | 14.127292 | 3.524049 | 6.981000 | 11.700000 | 13.370000 | 15.780000 | 28.11000 |
| texture_mean | 569.0 | 19.289649 | 4.301036 | 9.710000 | 16.170000 | 18.840000 | 21.800000 | 39.28000 |
| perimeter_mean | 569.0 | 91.969033 | 24.298981 | 43.790000 | 75.170000 | 86.240000 | 104.100000 | 188.50000 |
| area_mean | 569.0 | 654.889104 | 351.914129 | 143.500000 | 420.300000 | 551.100000 | 782.700000 | 2501.00000 |
| smoothness_mean | 569.0 | 0.096360 | 0.014064 | 0.052630 | 0.086370 | 0.095870 | 0.105300 | 0.16340 |
| compactness_mean | 569.0 | 0.104341 | 0.052813 | 0.019380 | 0.064920 | 0.092630 | 0.130400 | 0.34540 |
| concavity_mean | 569.0 | 0.088799 | 0.079720 | 0.000000 | 0.029560 | 0.061540 | 0.130700 | 0.42680 |
| concave points_mean | 569.0 | 0.048919 | 0.038803 | 0.000000 | 0.020310 | 0.033500 | 0.074000 | 0.20120 |
| symmetry_mean | 569.0 | 0.181162 | 0.027414 | 0.106000 | 0.161900 | 0.179200 | 0.195700 | 0.30400 |
| fractal_dimension_mean | 569.0 | 0.062798 | 0.007060 | 0.049960 | 0.057700 | 0.061540 | 0.066120 | 0.09744 |
| radius_se | 569.0 | 0.405172 | 0.277313 | 0.111500 | 0.232400 | 0.324200 | 0.478900 | 2.87300 |
| texture_se | 569.0 | 1.216853 | 0.551648 | 0.360200 | 0.833900 | 1.108000 | 1.474000 | 4.88500 |
| perimeter_se | 569.0 | 2.866059 | 2.021855 | 0.757000 | 1.606000 | 2.287000 | 3.357000 | 21.98000 |
| area_se | 569.0 | 40.337079 | 45.491006 | 6.802000 | 17.850000 | 24.530000 | 45.190000 | 542.20000 |
| smoothness_se | 569.0 | 0.007041 | 0.003003 | 0.001713 | 0.005169 | 0.006380 | 0.008146 | 0.03113 |
| compactness_se | 569.0 | 0.025478 | 0.017908 | 0.002252 | 0.013080 | 0.020450 | 0.032450 | 0.13540 |
| concavity_se | 569.0 | 0.031894 | 0.030186 | 0.000000 | 0.015090 | 0.025890 | 0.042050 | 0.39600 |
| concave points_se | 569.0 | 0.011796 | 0.006170 | 0.000000 | 0.007638 | 0.010930 | 0.014710 | 0.05279 |
| symmetry_se | 569.0 | 0.020542 | 0.008266 | 0.007882 | 0.015160 | 0.018730 | 0.023480 | 0.07895 |
| fractal_dimension_se | 569.0 | 0.003795 | 0.002646 | 0.000895 | 0.002248 | 0.003187 | 0.004558 | 0.02984 |
| radius_worst | 569.0 | 16.269190 | 4.833242 | 7.930000 | 13.010000 | 14.970000 | 18.790000 | 36.04000 |
| texture_worst | 569.0 | 25.677223 | 6.146258 | 12.020000 | 21.080000 | 25.410000 | 29.720000 | 49.54000 |
| perimeter_worst | 569.0 | 107.261213 | 33.602542 | 50.410000 | 84.110000 | 97.660000 | 125.400000 | 251.20000 |
| area_worst | 569.0 | 880.583128 | 569.356993 | 185.200000 | 515.300000 | 686.500000 | 1084.000000 | 4254.00000 |
| smoothness_worst | 569.0 | 0.132369 | 0.022832 | 0.071170 | 0.116600 | 0.131300 | 0.146000 | 0.22260 |
| compactness_worst | 569.0 | 0.254265 | 0.157336 | 0.027290 | 0.147200 | 0.211900 | 0.339100 | 1.05800 |
| concavity_worst | 569.0 | 0.272188 | 0.208624 | 0.000000 | 0.114500 | 0.226700 | 0.382900 | 1.25200 |
| concave points_worst | 569.0 | 0.114606 | 0.065732 | 0.000000 | 0.064930 | 0.099930 | 0.161400 | 0.29100 |
| symmetry_worst | 569.0 | 0.290076 | 0.061867 | 0.156500 | 0.250400 | 0.282200 | 0.317900 | 0.66380 |
| fractal_dimension_worst | 569.0 | 0.083946 | 0.018061 | 0.055040 | 0.071460 | 0.080040 | 0.092080 | 0.20750 |
In [254]:
X = df.drop('diagnosis', axis=1) # Drop the 'diagnosis' column (target)
y = df['diagnosis']
In [255]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [256]:
model = DecisionTreeClassifier(criterion='entropy')  # criterion: 'gini' or 'entropy'
model.fit(X_train, y_train)
model
Out[256]:
DecisionTreeClassifier(criterion='entropy')
In [263]:
import math

def entropy(column):
    # Shannon entropy (in bits) of a categorical column
    counts = column.value_counts()
    prob = counts / len(column)
    return -sum(prob * prob.apply(math.log2))

def condition_entropy(data, X, target):
    # Weighted entropy of the target within each value of the feature
    feature_values = data[X].unique()
    we = 0
    for value in feature_values:
        subset = data[data[X] == value]
        we += (len(subset) / len(data)) * entropy(subset[target])
    return we

def information_gained(data, X, target):
    total_e = entropy(data[target])
    we = condition_entropy(data, X, target)
    return total_e - we

for feature in X:
    ig = information_gained(df, feature, "diagnosis")
    print(f"Information Gain for {feature}: {ig}")
Information Gain for radius_mean: 0.8607815854835991
Information Gain for texture_mean: 0.8357118798482908
Information Gain for perimeter_mean: 0.9267038614138748
Information Gain for area_mean: 0.9280305529818247
Information Gain for smoothness_mean: 0.7761788341876101
Information Gain for compactness_mean: 0.9091291689709926
Information Gain for concavity_mean: 0.9350604299589776
Information Gain for concave points_mean: 0.9420903069361305
Information Gain for symmetry_mean: 0.735036638169654
Information Gain for fractal_dimension_mean: 0.8361770160635639
Information Gain for radius_se: 0.9337337383910278
Information Gain for texture_se: 0.8642965239721755
Information Gain for perimeter_se: 0.9315454914704012
Information Gain for area_se: 0.925377169845925
Information Gain for smoothness_se: 0.9350604299589776
Information Gain for compactness_se: 0.9231889229252984
Information Gain for concavity_se: 0.9280305529818247
Information Gain for concave points_se: 0.8585933385629725
Information Gain for symmetry_se: 0.8181371874054084
Information Gain for fractal_dimension_se: 0.9174857375160954
Information Gain for radius_worst: 0.9003074642106167
Information Gain for texture_worst: 0.8634349686194988
Information Gain for perimeter_worst: 0.8985843535052632
Information Gain for area_worst: 0.9350604299589776
Information Gain for smoothness_worst: 0.7197189097252679
Information Gain for compactness_worst: 0.9183472928687721
Information Gain for concavity_worst: 0.9302187999024514
Information Gain for concave points_worst: 0.9148323543801957
Information Gain for symmetry_worst: 0.8453951399613433
Information Gain for fractal_dimension_worst: 0.8915544765281104
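Note that this information-gain computation treats every distinct value of a continuous feature as a separate branch, which inflates the numbers above (most are close to the total entropy of the target, about 0.953 bits). The fitted tree instead searches binary thresholds; its own split-quality summary is available directly. A minimal sketch:

In [ ]:
# Feature importances from the fitted tree (threshold-based impurity reduction)
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))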
In [272]:
plt.figure(figsize=(22, 16))
plot_tree(model, filled=True, feature_names=X.columns, class_names=['Benign', 'Malignant'])
plt.show()
In [265]:
y_pred = model.predict(X_test)
y_pred
Out[265]:
array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
1, 0, 0, 1], dtype=int64)
In [266]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred) * 100
classification_rep = classification_report(y_test, y_pred)
# Print the results
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
Accuracy: 93.85964912280701
Classification Report:
precision recall f1-score support
0 0.93 0.97 0.95 71
1 0.95 0.88 0.92 43
accuracy 0.94 114
macro avg 0.94 0.93 0.93 114
weighted avg 0.94 0.94 0.94 114
In [267]:
df.head(1)
Out[267]:
| | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 17.99 | 10.38 | 122.8 | 1001.0 | 0.1184 | 0.2776 | 0.3001 | 0.1471 | 0.2419 | 0.07871 | 1.095 | 0.9053 | 8.589 | 153.4 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.6 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.1189 |
In [268]:
new = [[12.5, 19.2, 80.0, 500.0, 0.085, 0.1, 0.05, 0.02, 0.17, 0.06,
0.4, 1.0, 2.5, 40.0, 0.006, 0.02, 0.03, 0.01, 0.02, 0.003,
16.0, 25.0, 105.0, 900.0, 0.13, 0.25, 0.28, 0.12, 0.29, 0.08]]
y_pred = model.predict(new)
# Output the prediction (0 = Benign, 1 = Malignant)
if y_pred[0] == 0:
print("Prediction: Benign")
else:
print("Prediction: Malignant")
Prediction: Benign
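scikit-learn warns when a model fitted on a DataFrame is asked to predict from a bare list, because the feature names are missing; wrapping the sample in a DataFrame keeps the columns aligned. A minimal sketch:

In [ ]:
# Wrap the new sample in a DataFrame so feature names match the training data
new_df = pd.DataFrame(new, columns=X.columns)
print("Prediction:", "Malignant" if model.predict(new_df)[0] == 1 else "Benign")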
Prg 9 - Develop a program to implement the Naive Bayesian classifier, considering the Olivetti Face dataset for training. Compute the accuracy of the classifier, considering a few test data sets.
In [273]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [274]:
from sklearn.datasets import fetch_olivetti_faces
data = fetch_olivetti_faces()
In [275]:
data.keys()
Out[275]:
dict_keys(['data', 'images', 'target', 'DESCR'])
In [276]:
print("Data Shape:", data.data.shape)
print("Target Shape:", data.target.shape)
print("There are {} unique persons in the dataset".format(len(np.unique(data.target))))
print("Size of each image is {}x{}".format(data.images.shape[1],data.images.shape[1]))
Data Shape: (400, 4096)
Target Shape: (400,)
There are 40 unique persons in the dataset
Size of each image is 64x64
In [277]:
def print_faces(images, target, top_n):
# Ensure the number of images does not exceed available data
top_n = min(top_n, len(images))
# Set up figure size based on the number of images
grid_size = int(np.ceil(np.sqrt(top_n)))
fig, axes = plt.subplots(grid_size, grid_size, figsize=(15, 15))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.2, wspace=0.2)
for i, ax in enumerate(axes.ravel()):
if i < top_n:
ax.imshow(images[i], cmap='bone')
ax.axis('off')
ax.text(2, 12, str(target[i]), fontsize=9, color='red')
ax.text(2, 55, f"face: {i}", fontsize=9, color='blue')
else:
ax.axis('off')
plt.show()
In [278]:
print_faces(data.images, data.target, 400)
In [284]:
def display_unique_faces(pics):
fig = plt.figure(figsize=(24, 10)) # Set figure size
columns, rows = 10, 4 # Define grid dimensions
# Loop through grid positions and plot each image
for i in range(1, columns * rows + 1):
img_index = 10 * i - 1 # Calculate the image index
if img_index < pics.shape[0]: # Check for valid image index
img = pics[img_index, :, :]
ax = fig.add_subplot(rows, columns, i)
ax.imshow(img, cmap='gray')
ax.set_title(f"Person {i}", fontsize=14)
ax.axis('off')
plt.suptitle("There are 40 distinct persons in the dataset", fontsize=24)
plt.show()
In [285]:
display_unique_faces(data.images)
In [286]:
from sklearn.model_selection import train_test_split
X = data.data
Y = data.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
print("x_train: ",x_train.shape)
print("x_test: ",x_test.shape)
x_train:  (280, 4096)
x_test:  (120, 4096)
In [287]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
# Train the model
nb = GaussianNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
nb_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# Display accuracy result
print(f"Naive Bayes Accuracy: {nb_accuracy}%")
Confusion Matrix:
[[3 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 3 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 4]]
Naive Bayes Accuracy: 74.17%
In [288]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Initialize and fit Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Multinomial Naive Bayes Accuracy: {accuracy}%")
Multinomial Naive Bayes Accuracy: 75.83%
In [294]:
# Calculate the number of misclassified images
misclassified_idx = np.where(y_pred != y_test)[0]
num_misclassified = len(misclassified_idx)
# Print the number of misclassified images and accuracy
print(f"Number of misclassified images: {num_misclassified}")
print(f"Total images in test set: {len(y_test)}")
print(f"Accuracy: {round((1 - num_misclassified / len(y_test)) * 100, 2)}%")
# Visualize some of the misclassified images
n_misclassified_to_show = min(num_misclassified, 5)  # Show up to 5 misclassified images
plt.figure(figsize=(10, 5))
for i in range(n_misclassified_to_show):
idx = misclassified_idx[i]
plt.subplot(1, n_misclassified_to_show, i + 1)
plt.imshow(x_test[idx].reshape(64, 64), cmap='gray')
plt.title(f"True: {y_test[idx]}, Pred: {y_pred[idx]}")
plt.axis('off')
plt.show()
Number of misclassified images: 29
Total images in test set: 120
Accuracy: 75.83%
In [295]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
# Binarize the test labels
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
# Get predicted probabilities for each class
y_pred_prob = nb.predict_proba(x_test)
# Calculate and print AUC for each class
for i in range(y_test_bin.shape[1]):
roc_auc = roc_auc_score(y_test_bin[:, i], y_pred_prob[:, i])
print(f"Class {i} AUC: {roc_auc:.2f}")
Class 0 AUC: 0.69
Class 1 AUC: 1.00
Class 2 AUC: 0.99
Class 3 AUC: 0.98
Class 4 AUC: 0.97
Class 5 AUC: 1.00
Class 6 AUC: 1.00
Class 7 AUC: 0.89
Class 8 AUC: 1.00
Class 9 AUC: 1.00
Class 10 AUC: 1.00
Class 11 AUC: 1.00
Class 12 AUC: 0.99
Class 13 AUC: 1.00
Class 14 AUC: 1.00
Class 15 AUC: 1.00
Class 16 AUC: 0.50
Class 17 AUC: 0.18
Class 18 AUC: 0.42
Class 19 AUC: 0.87
Class 20 AUC: 0.31
Class 21 AUC: 0.68
Class 22 AUC: 0.20
Class 23 AUC: 0.44
Class 24 AUC: 0.25
Class 25 AUC: 0.55
Class 26 AUC: 0.76
Class 27 AUC: 0.75
Class 28 AUC: 0.51
Class 29 AUC: 0.70
Class 30 AUC: 0.50
Class 31 AUC: 0.19
Class 32 AUC: 0.41
Class 33 AUC: 0.41
Class 34 AUC: 0.22
Class 35 AUC: 0.04
Class 36 AUC: 0.87
Class 37 AUC: 0.82
Class 38 AUC: 0.51
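The per-class AUCs can be condensed into one macro average. The sketch below also aligns the probability columns with the classes actually present in the test split (nb.classes_ covers the training classes, which may be a superset):

In [ ]:
# Macro-averaged one-vs-rest AUC over the classes present in the test set
present = np.unique(y_test)
col_idx = [list(nb.classes_).index(c) for c in present]
macro_auc = roc_auc_score(y_test_bin, y_pred_prob[:, col_idx], average="macro")
print(f"Macro-average AUC: {macro_auc:.2f}")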
Prg 10 - Develop a program to implement k-means clustering using the Wisconsin Breast Cancer dataset and visualize the clustering result.
In [296]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')
In [297]:
data = pd.read_csv(r"./WisconsinBreastCancerdataset.csv")
In [298]:
data.head()
Out[298]:
| | id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | radius_se | texture_se | perimeter_se | area_se | smoothness_se | compactness_se | concavity_se | concave points_se | symmetry_se | fractal_dimension_se | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | 1.0950 | 0.9053 | 8.589 | 153.40 | 0.006399 | 0.04904 | 0.05373 | 0.01587 | 0.03003 | 0.006193 | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | 0.5435 | 0.7339 | 3.398 | 74.08 | 0.005225 | 0.01308 | 0.01860 | 0.01340 | 0.01389 | 0.003532 | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | 0.7456 | 0.7869 | 4.585 | 94.03 | 0.006150 | 0.04006 | 0.03832 | 0.02058 | 0.02250 | 0.004571 | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | 0.4956 | 1.1560 | 3.445 | 27.23 | 0.009110 | 0.07458 | 0.05661 | 0.01867 | 0.05963 | 0.009208 | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | 0.7572 | 0.7813 | 5.438 | 94.44 | 0.011490 | 0.02461 | 0.05688 | 0.01885 | 0.01756 | 0.005115 | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
In [299]:
data.shape
Out[299]:
(569, 33)
In [300]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB
In [301]:
data.diagnosis.unique()
Out[301]:
array(['M', 'B'], dtype=object)
In [302]:
data.isnull().sum()
Out[302]:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
Unnamed: 32              569
dtype: int64
In [303]:
data.duplicated().sum()
Out[303]:
0
In [304]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)
In [305]:
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0}) # Malignant:1, Benign:0
In [306]:
df.describe().T
Out[306]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| diagnosis | 569.0 | 0.372583 | 0.483918 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.00000 |
| radius_mean | 569.0 | 14.127292 | 3.524049 | 6.981000 | 11.700000 | 13.370000 | 15.780000 | 28.11000 |
| texture_mean | 569.0 | 19.289649 | 4.301036 | 9.710000 | 16.170000 | 18.840000 | 21.800000 | 39.28000 |
| perimeter_mean | 569.0 | 91.969033 | 24.298981 | 43.790000 | 75.170000 | 86.240000 | 104.100000 | 188.50000 |
| area_mean | 569.0 | 654.889104 | 351.914129 | 143.500000 | 420.300000 | 551.100000 | 782.700000 | 2501.00000 |
| smoothness_mean | 569.0 | 0.096360 | 0.014064 | 0.052630 | 0.086370 | 0.095870 | 0.105300 | 0.16340 |
| compactness_mean | 569.0 | 0.104341 | 0.052813 | 0.019380 | 0.064920 | 0.092630 | 0.130400 | 0.34540 |
| concavity_mean | 569.0 | 0.088799 | 0.079720 | 0.000000 | 0.029560 | 0.061540 | 0.130700 | 0.42680 |
| concave points_mean | 569.0 | 0.048919 | 0.038803 | 0.000000 | 0.020310 | 0.033500 | 0.074000 | 0.20120 |
| symmetry_mean | 569.0 | 0.181162 | 0.027414 | 0.106000 | 0.161900 | 0.179200 | 0.195700 | 0.30400 |
| fractal_dimension_mean | 569.0 | 0.062798 | 0.007060 | 0.049960 | 0.057700 | 0.061540 | 0.066120 | 0.09744 |
| radius_se | 569.0 | 0.405172 | 0.277313 | 0.111500 | 0.232400 | 0.324200 | 0.478900 | 2.87300 |
| texture_se | 569.0 | 1.216853 | 0.551648 | 0.360200 | 0.833900 | 1.108000 | 1.474000 | 4.88500 |
| perimeter_se | 569.0 | 2.866059 | 2.021855 | 0.757000 | 1.606000 | 2.287000 | 3.357000 | 21.98000 |
| area_se | 569.0 | 40.337079 | 45.491006 | 6.802000 | 17.850000 | 24.530000 | 45.190000 | 542.20000 |
| smoothness_se | 569.0 | 0.007041 | 0.003003 | 0.001713 | 0.005169 | 0.006380 | 0.008146 | 0.03113 |
| compactness_se | 569.0 | 0.025478 | 0.017908 | 0.002252 | 0.013080 | 0.020450 | 0.032450 | 0.13540 |
| concavity_se | 569.0 | 0.031894 | 0.030186 | 0.000000 | 0.015090 | 0.025890 | 0.042050 | 0.39600 |
| concave points_se | 569.0 | 0.011796 | 0.006170 | 0.000000 | 0.007638 | 0.010930 | 0.014710 | 0.05279 |
| symmetry_se | 569.0 | 0.020542 | 0.008266 | 0.007882 | 0.015160 | 0.018730 | 0.023480 | 0.07895 |
| fractal_dimension_se | 569.0 | 0.003795 | 0.002646 | 0.000895 | 0.002248 | 0.003187 | 0.004558 | 0.02984 |
| radius_worst | 569.0 | 16.269190 | 4.833242 | 7.930000 | 13.010000 | 14.970000 | 18.790000 | 36.04000 |
| texture_worst | 569.0 | 25.677223 | 6.146258 | 12.020000 | 21.080000 | 25.410000 | 29.720000 | 49.54000 |
| perimeter_worst | 569.0 | 107.261213 | 33.602542 | 50.410000 | 84.110000 | 97.660000 | 125.400000 | 251.20000 |
| area_worst | 569.0 | 880.583128 | 569.356993 | 185.200000 | 515.300000 | 686.500000 | 1084.000000 | 4254.00000 |
| smoothness_worst | 569.0 | 0.132369 | 0.022832 | 0.071170 | 0.116600 | 0.131300 | 0.146000 | 0.22260 |
| compactness_worst | 569.0 | 0.254265 | 0.157336 | 0.027290 | 0.147200 | 0.211900 | 0.339100 | 1.05800 |
| concavity_worst | 569.0 | 0.272188 | 0.208624 | 0.000000 | 0.114500 | 0.226700 | 0.382900 | 1.25200 |
| concave points_worst | 569.0 | 0.114606 | 0.065732 | 0.000000 | 0.064930 | 0.099930 | 0.161400 | 0.29100 |
| symmetry_worst | 569.0 | 0.290076 | 0.061867 | 0.156500 | 0.250400 | 0.282200 | 0.317900 | 0.66380 |
| fractal_dimension_worst | 569.0 | 0.083946 | 0.018061 | 0.055040 | 0.071460 | 0.080040 | 0.092080 | 0.20750 |
In [307]:
# Drop the diagnosis (target) column, since clustering is unsupervised
df.drop(columns=["diagnosis"], inplace=True)
In [308]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)
In [309]:
# Apply PCA for Dimensionality Reduction
pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)
In [310]:
# Check explained variance ratio
explained_variance = pca.explained_variance_ratio_
total_explained_variance = np.sum(explained_variance)
print(f"Variance explained by PC1: {explained_variance[0]:.4f}")
print(f"Variance explained by PC2: {explained_variance[1]:.4f}")
print(f"Total variance explained by first 2 components: {total_explained_variance:.4f}")
Variance explained by PC1: 0.4427
Variance explained by PC2: 0.1897
Total variance explained by first 2 components: 0.6324
In [313]:
wcss = []  # Within-Cluster Sum of Squares
K_range = range(1, 11)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_pca)
    wcss.append(kmeans.inertia_)
In [314]:
# Plot the Elbow Method Graph
plt.figure(figsize=(8, 5))
plt.plot(K_range, wcss, marker="o", linestyle="-")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS")
plt.title("Elbow Method to Find Optimal k")
plt.show()
In [316]:
#Apply K-Means Clustering with the optimal k (usually where elbow occurs, k=2)
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)
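silhouette_score is imported above but not yet used; it gives a quantitative counterpart to the elbow plot (values closer to 1 mean better-separated clusters). A minimal sketch:

In [ ]:
# Silhouette score for the chosen number of clusters
score = silhouette_score(X_pca, clusters)
print(f"Silhouette score for k={optimal_k}: {score:.3f}")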
In [324]:
# Visualize the Clusters
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, alpha=0.6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clustering after PCA")
plt.legend()
plt.show()